Run Libraries

rm(list=ls())

library(pheatmap)
library(ggplot2)
library(dichromat)
library(dplyr)
library(ggrepel)
library(reshape2)
library(umap)
library(ggthemes)
library(cowplot)
library(DEP)
library(readr)
library(naniar)
library(SummarizedExperiment)
library(data.table)
library(readxl)
library(writexl)
library(stringr)
library(vsn)
library(rmarkdown)
library(tidyr)
library(Polychrome)
library(RColorBrewer)
library(skimr)
library(rstatix)
# install.packages("devtools")
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# options(repos = c(
#     CRAN = "https://cloud.r-project.org/",
#     BiocManager::repositories()))
# ## Install dependencies and package
# devtools::install_github(
#     "BioinfOMICS/LipidSigR",
#     build_vignettes = TRUE, dependencies = TRUE)
# BiocManager::install(
#     c('fgsea', 'gatom', 'mixOmics', 'S4Vectors', 'BiocGenerics',
#       'SummarizedExperiment', 'rgoslin'))
# install.packages(
#     c('devtools', 'magrittr', 'plotly', 'tidyverse', 'factoextra', 'ggthemes',
#       'ggforce', 'Hmisc', 'hwordcloud', 'heatmaply', 'iheatmapr', 'Rtsne', 'uwot',
#       'wordcloud', 'rsample', 'ranger', 'caret', 'yardstick', 'fastshap',
#       'SHAPforxgboost', 'visNetwork', 'tidygraph', 'ggraph'))
# devtools::install_github("ctlab/mwcsr")

Set working directories

knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())

# create directory for results
dir.create(file.path(getwd(),'results'), showWarnings = FALSE)
# create directory for plots
dir.create(file.path(getwd(),'plots'), showWarnings = FALSE)

Load data

Loading data including:

  • All templates: information on the tube ID, rack, visit, country, type of fluid, position
  • Shipment of all boxes file: information on the patient ID, visit, position
  • Raw data: lipidomics expression with info on the tube ID for CSF and Plasma

Step 1: merge info of all patients

The first step is to merge the information of the “All templates” and “Shipment of all boxes” files, and create 3 datasets (one for plasma, one for CSF, one for urine) with the information of Patient ID, Position, Biofluid, Visit, Tube number, Country, Rack and Box.

- CSF

## sample info - CSF
paged_table(lipidomics_CSF_all)
dim(lipidomics_CSF_all)
## [1] 415   8
lipidomics_CSF_all_summary <- lipidomics_CSF_all %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE),
    total_tubes = n_distinct(Tube_number, na.rm = TRUE),
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_CSF_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_CSF_all_V0 = lipidomics_CSF_all %>%
  filter(Visit == "V0")
paged_table(lipidomics_CSF_all_V0 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_CSF_all_V0 %>%
  filter(Patient %in% lipidomics_CSF_all_V0[duplicated(lipidomics_CSF_all_V0$Patient) & !is.na(lipidomics_CSF_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_CSF_all_V1 = lipidomics_CSF_all %>%
  filter(Visit == "V1")
paged_table(lipidomics_CSF_all_V1 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_CSF_all_V1 %>%
  filter(Patient %in% lipidomics_CSF_all_V1[duplicated(lipidomics_CSF_all_V1$Patient) & !is.na(lipidomics_CSF_all_V1$Patient),]$Patient))

- Plasma

## sample info - plasma
paged_table(lipidomics_plasma_all)
dim(lipidomics_plasma_all)
## [1] 415   8
lipidomics_plasma_all_summary <- lipidomics_plasma_all %>%
  group_by(Visit) %>%
   summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE),
    total_tubes = n_distinct(Tube_number, na.rm = TRUE),
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_plasma_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_plasma_all_V0 = lipidomics_plasma_all %>%
  filter(Visit == "V0")
paged_table(lipidomics_plasma_all_V0 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_plasma_all_V0 %>%
  filter(Patient %in% lipidomics_plasma_all_V0[duplicated(lipidomics_plasma_all_V0$Patient) & !is.na(lipidomics_plasma_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_plasma_all_V1 = lipidomics_plasma_all %>%
  filter(Visit == "V1")
paged_table(lipidomics_plasma_all_V1 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_plasma_all_V1 %>%
  filter(Patient %in% lipidomics_plasma_all_V1[duplicated(lipidomics_plasma_all_V1$Patient) & !is.na(lipidomics_plasma_all_V1$Patient),]$Patient))

- Urine

## sample info - urine
paged_table(lipidomics_urine_all)
dim(lipidomics_urine_all)
## [1] 384   8
lipidomics_urine_all_summary <- lipidomics_urine_all %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE),
    total_tubes = n_distinct(Tube_number, na.rm = TRUE),
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_urine_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_urine_all_V0 = lipidomics_urine_all %>%
  filter(Visit == "V0")
paged_table(lipidomics_urine_all_V0 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_urine_all_V0 %>%
  filter(Patient %in% lipidomics_urine_all_V0[duplicated(lipidomics_urine_all_V0$Patient) & !is.na(lipidomics_urine_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_urine_all_V1 = lipidomics_urine_all %>%
  filter(Visit == "V1")
paged_table(lipidomics_urine_all_V1 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_urine_all_V1 %>%
  filter(Patient %in% lipidomics_urine_all_V1[duplicated(lipidomics_urine_all_V1$Patient) & !is.na(lipidomics_urine_all_V1$Patient),]$Patient))

Step 2: aggregate all raw files

The second step is to aggregate all the raw data files together and save the batch information, in order to check later if there are any batch effects in the data.

- CSF

paged_table(raw_data_CSF_final)
dim(raw_data_CSF_final)
## [1] 174 894
# how many rows of QC
length(grep("QC",raw_data_CSF_final$Tube_number))
## [1] 30

- Plasma

paged_table(raw_data_plasma_final)
dim(raw_data_plasma_final)
## [1] 382 894
# how many rows of QC
length(grep("QC",raw_data_plasma_final$Tube_number))
## [1] 66

Overview of data

Step 3: Combine raw data files with patient info

The third step is to combine the raw data files with the patient information, and check the amount of samples per visit for each fluid.

- CSF

raw_data_CSF_IDs = raw_data_CSF_final %>%
  select(Tube_number,Batch)
raw_data_CSF_IDs <- raw_data_CSF_IDs %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) %>% # last 4 digits
  filter(!str_detect(Tube_number, "QC")) # remove 30 QC IDs

# how many IDs in the raw data CSF
length(raw_data_CSF_IDs$Tube_number)
## [1] 144
# how many unique IDs in the raw data CSF
length(unique(raw_data_CSF_IDs$Tube_number))
## [1] 144
lipidomics_CSF_all <- lipidomics_CSF_all %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) # last 4 digits
full_match_CSF = raw_data_CSF_IDs %>%
  left_join(lipidomics_CSF_all,by = "Tube_number") %>%
  mutate(match_type = ifelse(!is.na(Tube_number4.y),"Full",NA))
no_match_CSF = full_match_CSF %>%
  filter(is.na(match_type)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.y) %>%
  select(-names(lipidomics_CSF_all)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.x)
last4_match_CSF = no_match_CSF %>%
  left_join(lipidomics_CSF_all %>% 
              filter(!Tube_number %in% (full_match_CSF %>% filter(!is.na(match_type)) %>%
                                          pull(Tube_number))),
            by = "Tube_number4") %>%
  mutate(match_type = ifelse(!is.na(Tube_number),"Last4","No match"))

# final description of the samples available in the raw data without QC samples
combined_data_CSF = full_match_CSF %>%
  filter(!is.na(match_type)) %>%
  select(-c(Tube_number4.x,Tube_number4.y,match_type)) %>%
  bind_rows(last4_match_CSF %>% 
              select(-Tube_number) %>% 
              left_join(raw_data_CSF_IDs %>% select(Tube_number,Tube_number4) %>%
                          filter(!Tube_number %in% (full_match_CSF %>% filter(!is.na(match_type)) %>%
                                          pull(Tube_number)))) %>%
              select(Tube_number,Patient,Position,Biofluid,Visit,Country,Rack,Box,Batch)) %>%
  distinct()

paged_table(combined_data_CSF)
# how many IDs in the raw data CSF now 
length(combined_data_CSF$Tube_number)
## [1] 144
# how many unique IDs in the raw data CSF now
length(unique(combined_data_CSF$Tube_number))
## [1] 144
# how many unique Patient IDs
length(unique(na.omit(combined_data_CSF$Patient)))
## [1] 112
# how many tube numbers and patient IDs per V0 and V1
combined_data_CSF %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE), # amount of patient IDs
    total_tubes = n_distinct(Tube_number, na.rm = TRUE), # amount of tube numbers
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)])) # amount of pairs
# which Tube numbers at V1 do not have a patient ID
combined_data_CSF %>%
  filter(Visit == "V1") %>%
  filter(is.na(Patient))
# patients at V0 and V1
patients_CSF_V0 <- combined_data_CSF %>%
  filter(Visit == "V0") %>%
  pull(Patient) %>%
  unique()
patients_CSF_V1 <- combined_data_CSF %>%
  filter(Visit == "V1") %>%
  pull(Patient) %>%
  unique() %>% 
  na.omit()

# check if all patients at V1 are at V0
all(patients_CSF_V1 %in% patients_CSF_V0)
## [1] FALSE
patients_CSF_V1[!patients_CSF_V1 %in% patients_CSF_V0] # patient TR319
## [1] "TR319"
# check patient in the info_patient dataset
lipidomics_CSF_all[lipidomics_CSF_all$Patient %in% patients_CSF_V1[!patients_CSF_V1 %in% patients_CSF_V0],]

- Plasma

raw_data_plasma_IDs = raw_data_plasma_final %>%
  select(Tube_number,Batch)
raw_data_plasma_IDs <- raw_data_plasma_IDs %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) %>% # last 4 digits
  filter(!str_detect(Tube_number, "QC")) # remove 66 QC IDs

# how many IDs in the raw data plasma
length(raw_data_plasma_IDs$Tube_number)
## [1] 316
# how many unique IDs in the raw data plasma
length(unique(raw_data_plasma_IDs$Tube_number))
## [1] 316
lipidomics_plasma_all <- lipidomics_plasma_all %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) # last 4 digits
full_match_plasma = raw_data_plasma_IDs %>%
  left_join(lipidomics_plasma_all,by = "Tube_number") %>%
  mutate(match_type = ifelse(!is.na(Tube_number4.y),"Full",NA))
no_match_plasma = full_match_plasma %>%
  filter(is.na(match_type)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.y) %>%
  select(-names(lipidomics_plasma_all)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.x)
last4_match_plasma = no_match_plasma %>%
  left_join(lipidomics_plasma_all,by = "Tube_number4") %>%
  mutate(match_type = ifelse(!is.na(Tube_number),"Last4","No match"))

# final description of the samples available in the raw data without QC samples
combined_data_plasma = full_match_plasma %>%
  filter(!is.na(match_type)) %>%
  select(-c(Tube_number4.x,Tube_number4.y,match_type)) %>%
  bind_rows(last4_match_plasma %>% 
              select(-Tube_number) %>% 
              left_join(raw_data_plasma_IDs %>% select(Tube_number,Tube_number4)) %>%
              select(Tube_number,Patient,Position,Biofluid,Visit,Country,Rack,Box,Batch)) %>%
  distinct()

paged_table(combined_data_plasma)
# how many IDs in the raw data plasma now 
length(combined_data_plasma$Tube_number)
## [1] 316
# how many unique IDs in the raw data plasma now
length(unique(combined_data_plasma$Tube_number))
## [1] 316
# how many unique Patient IDs
length(unique(na.omit(combined_data_plasma$Patient)))
## [1] 200
# how many tube numbers and patient IDs per V0 and V1
combined_data_plasma %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE), # amount of patient IDs
    total_tubes = n_distinct(Tube_number, na.rm = TRUE), # amount of tube numbers
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)])) # amount of pairs
# which Tube numbers at V0 do not have a patient ID
combined_data_plasma %>%
  filter(Visit == "V0") %>%
  filter(is.na(Patient))
# which Tube numbers at V1 do not have a patient ID
combined_data_plasma %>%
  filter(Visit == "V1") %>%
  filter(is.na(Patient))
# patients at V0 and V1
patients_plasma_V0 <- combined_data_plasma %>%
  filter(Visit == "V0") %>%
  pull(Patient) %>%
  unique()
patients_plasma_V1 <- combined_data_plasma %>%
  filter(Visit == "V1") %>%
  pull(Patient) %>%
  unique() %>% 
  na.omit()

# check if all patients at V1 are at V0
all(patients_plasma_V1 %in% patients_plasma_V0)
## [1] FALSE
patients_plasma_V1[!patients_plasma_V1 %in% patients_plasma_V0] # patients TR207, TR116 and TR123
## [1] "TR207" "TR116" "TR123"
# check patient in the info_patient dataset
lipidomics_plasma_all[lipidomics_plasma_all$Patient %in% patients_plasma_V1[!patients_plasma_V1 %in% patients_plasma_V0],]

Step 4: Overview of the samples based on status

The fourth step is to give an overview of how many samples of eALS, PGMC, CTR and mimic we have per fluid, visit and country.

- CSF (the patient IDs given by Laura) - not displaying, but it is in code

- CSF (the patient IDs existing in the raw data)

IDs_CSF_data = combined_data_CSF %>%
  select(Patient,Visit,Tube_number) %>%
  distinct() %>%
  left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
  mutate(
    Country = dplyr::case_when(
      grepl("TR", Patient) ~ "Turkey",
      grepl("CH", Patient) ~ "Switzerland",
      grepl("DE", Patient) ~ "Germany",
      grepl("SK", Patient) ~ "Slovakia",
      grepl("FR", Patient) ~ "France",
      grepl("IL", Patient) ~ "Israel",
      TRUE                 ~ NA_character_
    )) %>%
  arrange(Patient)

paged_table(IDs_CSF_data)
overview_country_CSF = IDs_CSF_data %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_CSF
overview_status_CSF = IDs_CSF_data %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,N.A.) %>%
  arrange(Visit,Country)
overview_status_CSF
# Date info
date_CTR <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_data[IDs_CSF_data$type == "CTR",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "CTR")
date_ALS <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_data[IDs_CSF_data$type == "ALS",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "ALS")
date_mimic <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_data[IDs_CSF_data$type == "mimic",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "mimic")
date_PGMC <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_data[IDs_CSF_data$type == "PGMC",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "PGMC")
date_SYMP <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_data[IDs_CSF_data$type == "SYMP",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "SYMP")
date_NA = data.frame(Pseudonyme = "6H1HR8R7",age = 58,patient_group = "N.A.")

skim(date_CTR)
Data summary
Name date_CTR
Number of rows 20
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 20 0
patient_group 0 1 3 3 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 46.7 9.15 32 39.5 47 53.75 62 ▇▃▇▂▅
skim(date_ALS)
Data summary
Name date_ALS
Number of rows 46
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 46 0
patient_group 0 1 3 3 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 62.52 12.04 29 57 64 70.75 86 ▂▂▇▇▃
skim(date_mimic)
Data summary
Name date_mimic
Number of rows 16
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 16 0
patient_group 0 1 5 5 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 53.25 13.74 23 45.5 57 60.5 71 ▁▂▃▇▅
skim(date_PGMC)
Data summary
Name date_PGMC
Number of rows 21
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 21 0
patient_group 0 1 4 4 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 41.67 12.8 23 34 38 47 79 ▇▇▃▁▁
skimr::skim(date_NA)
Data summary
Name date_NA
Number of rows 1
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 1 0
patient_group 0 1 4 4 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 58 NA 58 58 58 58 58 ▁▁▇▁▁
skimr::skim(date_SYMP)
Data summary
Name date_SYMP
Number of rows 8
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 8 0
patient_group 0 1 4 4 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 66.25 10.71 45 63 66 71.75 82 ▂▁▇▃▂
date_all <- do.call("rbind",list(date_CTR,date_ALS,date_mimic,date_PGMC,date_SYMP,date_NA))

# -> Kruskal Wallis test between samples
kruskal_test(data = date_all,age~patient_group)
dunn_test(data = date_all,age~patient_group)
## Sex analysis
Sex_CTR <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_data[IDs_CSF_data$type == "CTR",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "CTR")
Sex_ALS <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_data[IDs_CSF_data$type == "ALS",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "ALS")
Sex_mimic <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_data[IDs_CSF_data$type == "mimic",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "mimic")
Sex_PGMC <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_data[IDs_CSF_data$type == "PGMC",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "PGMC")
Sex_SYMP <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_data[IDs_CSF_data$type == "SYMP",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "SYMP")
Sex_NA = data.frame(Pseudonyme = "6H1HR8R7",sex = "F",patient_group = "N.A.")

Sex_all <- do.call("rbind",list(Sex_ALS,
                                Sex_CTR,
                                Sex_mimic,
                                Sex_PGMC,
                                Sex_SYMP,
                                Sex_NA))
Sex_all <- Sex_all %>% filter(!is.na(sex))
table(Sex_all$patient_group,Sex_all$sex)
##        
##          F  M
##   ALS   19 27
##   CTR    7 13
##   mimic  4 12
##   N.A.   1  0
##   PGMC   8 13
##   SYMP   2  6
fisher.test(Sex_all$patient_group,Sex_all$sex)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  Sex_all$patient_group and Sex_all$sex
## p-value = 0.6781
## alternative hypothesis: two.sided
pairwise_fisher_test(table(Sex_all$patient_group,Sex_all$sex), p.adjust.method = "holm")

- Plasma (the patient IDs given by Laura) - not displaying, but it is in code

- Plasma (the patient IDs existing in the raw data)

IDs_plasma_data = combined_data_plasma %>%
  select(Patient,Visit,Tube_number) %>%
  distinct() %>%
  left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
  mutate(
    Country = dplyr::case_when(
      grepl("TR", Patient) ~ "Turkey",
      grepl("CH", Patient) ~ "Switzerland",
      grepl("DE", Patient) ~ "Germany",
      grepl("SK", Patient) ~ "Slovakia",
      grepl("FR", Patient) ~ "France",
      grepl("IL", Patient) ~ "Israel",
      TRUE                 ~ NA_character_
    )) %>%
  arrange(Patient)

paged_table(IDs_plasma_data)
overview_country_plasma = IDs_plasma_data %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_plasma
overview_status_plasma = IDs_plasma_data %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,N.A.) %>%
  arrange(Visit,Country)
overview_status_plasma
# Date info
date_CTR <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_plasma_data[IDs_plasma_data$type == "CTR",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "CTR")
date_ALS <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_plasma_data[IDs_plasma_data$type == "ALS",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "ALS")
date_mimic <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_plasma_data[IDs_plasma_data$type == "mimic",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "mimic")
date_PGMC <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_plasma_data[IDs_plasma_data$type == "PGMC",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "PGMC")
date_SYMP <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_plasma_data[IDs_plasma_data$type == "SYMP",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "SYMP")
date_NA = data.frame(Pseudonyme = "6H1HR8R7",age = 58,patient_group = "N.A.")

skim(date_CTR)
Data summary
Name date_CTR
Number of rows 47
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 47 0
patient_group 0 1 3 3 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 46.66 11.72 23 39 47 54.5 78 ▃▅▇▅▁
skim(date_ALS)
Data summary
Name date_ALS
Number of rows 65
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 65 0
patient_group 0 1 3 3 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 60.4 12.32 28 56 62 67 86 ▂▂▇▇▂
skim(date_mimic)
Data summary
Name date_mimic
Number of rows 21
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 21 0
patient_group 0 1 5 5 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 54.05 13.49 23 46 58 62 71 ▁▃▃▇▆
skim(date_PGMC)
Data summary
Name date_PGMC
Number of rows 53
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 53 0
patient_group 0 1 4 4 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 48.87 15.24 22 38 47 57 86 ▃▇▇▂▂
skimr::skim(date_NA)
Data summary
Name date_NA
Number of rows 1
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 1 0
patient_group 0 1 4 4 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 58 NA 58 58 58 58 58 ▁▁▇▁▁
skimr::skim(date_SYMP)
Data summary
Name date_SYMP
Number of rows 13
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 13 0
patient_group 0 1 4 4 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 65.46 9.36 45 63 64 71 82 ▁▁▇▂▂
date_all <- do.call("rbind",list(date_CTR,date_ALS,date_mimic,date_PGMC,date_SYMP,date_NA))

# -> Kruskal Wallis test between samples
kruskal_test(data = date_all,age~patient_group)
dunn_test(data = date_all,age~patient_group)
## Sex analysis
Sex_CTR <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_plasma_data[IDs_plasma_data$type == "CTR",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "CTR")
Sex_ALS <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_plasma_data[IDs_plasma_data$type == "ALS",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "ALS")
Sex_mimic <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_plasma_data[IDs_plasma_data$type == "mimic",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "mimic")
Sex_PGMC <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_plasma_data[IDs_plasma_data$type == "PGMC",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "PGMC")
Sex_SYMP <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_plasma_data[IDs_plasma_data$type == "SYMP",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "SYMP")
Sex_NA = data.frame(Pseudonyme = "6H1HR8R7",sex = "F",patient_group = "N.A.")

Sex_all <- do.call("rbind",list(Sex_ALS,
                                Sex_CTR,
                                Sex_mimic,
                                Sex_PGMC,
                                Sex_SYMP,
                                Sex_NA))
Sex_all <- Sex_all %>% filter(!is.na(sex))
table(Sex_all$patient_group,Sex_all$sex)
##        
##          F  M
##   ALS   27 38
##   CTR   21 26
##   mimic  4 17
##   N.A.   1  0
##   PGMC  31 22
##   SYMP   2 11
fisher.test(Sex_all$patient_group,Sex_all$sex)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  Sex_all$patient_group and Sex_all$sex
## p-value = 0.005206
## alternative hypothesis: two.sided
pairwise_fisher_test(table(Sex_all$patient_group,Sex_all$sex), p.adjust.method = "holm")

- CSF & Plasma (patient IDs in both raw CSF and Plasma data)

IDs_CSF_plasma_data = rbind(IDs_CSF_data,
                            IDs_plasma_data) %>%
  select(-Tube_number) %>%
  distinct()

overview_country_CSF_plasma = IDs_CSF_plasma_data %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_CSF_plasma
overview_status_CSF_plasma = IDs_CSF_plasma_data %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,N.A.) %>%
  arrange(Visit,Country)
overview_status_CSF_plasma
# overview of ALS mutations
ALS_mutations_ID %>%
    select(PatientID,type) %>%
    filter(PatientID %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "ALS",]$PatientID %>%unique()))) %>%
  arrange(type)
# Date info
date_CTR <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "CTR",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "CTR")
date_ALS <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "ALS",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "ALS")
date_mimic <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "mimic",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "mimic")
date_PGMC <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "PGMC",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "PGMC")
date_SYMP <- Sex_age_all_participants %>%
  select(Pseudonyme,age) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "SYMP",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "SYMP")
date_NA = data.frame(Pseudonyme = "6H1HR8R7",age = 58,patient_group = "N.A.")

skim(date_CTR)
Data summary
Name date_CTR
Number of rows 47
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 47 0
patient_group 0 1 3 3 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 46.66 11.72 23 39 47 54.5 78 ▃▅▇▅▁
skim(date_ALS)
Data summary
Name date_ALS
Number of rows 65
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 65 0
patient_group 0 1 3 3 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 60.4 12.32 28 56 62 67 86 ▂▂▇▇▂
skim(date_mimic)
Data summary
Name date_mimic
Number of rows 21
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 21 0
patient_group 0 1 5 5 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 54.05 13.49 23 46 58 62 71 ▁▃▃▇▆
skim(date_PGMC)
Data summary
Name date_PGMC
Number of rows 53
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 53 0
patient_group 0 1 4 4 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 48.87 15.24 22 38 47 57 86 ▃▇▇▂▂
skimr::skim(date_NA)
Data summary
Name date_NA
Number of rows 1
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 1 0
patient_group 0 1 4 4 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 58 NA 58 58 58 58 58 ▁▁▇▁▁
skimr::skim(date_SYMP)
Data summary
Name date_SYMP
Number of rows 13
Number of columns 3
_______________________
Column type frequency:
character 2
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Pseudonyme 0 1 8 8 0 13 0
patient_group 0 1 4 4 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 65.46 9.36 45 63 64 71 82 ▁▁▇▂▂
#date_all <- do.call("rbind",list(date_CTR,date_ALS,date_mimic,date_PGMC,date_SYMP,date_NA))
date_all <- do.call("rbind",list(date_CTR,date_ALS,date_mimic,date_PGMC))

# -> Kruskal Wallis test between samples
kruskal_test(data = date_all,age~patient_group)
dunn_test(data = date_all,age~patient_group)
## Sex analysis
Sex_CTR <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "CTR",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "CTR")
Sex_ALS <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "ALS",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "ALS")
Sex_mimic <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "mimic",]$PatientID %>%unique()))) %>%
  mutate(patient_group = "mimic")
Sex_PGMC <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "PGMC",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "PGMC")
Sex_SYMP <- Sex_age_all_participants %>%
  select(Pseudonyme,sex) %>%
  filter(Pseudonyme %in% na.omit((IDs_CSF_plasma_data[IDs_CSF_plasma_data$type == "SYMP",]$PatientID %>%unique())))  %>%
  mutate(patient_group = "SYMP")
Sex_NA = data.frame(Pseudonyme = "6H1HR8R7",sex = "F",patient_group = "N.A.")

Sex_all <- do.call("rbind",list(Sex_ALS,
                                Sex_CTR,
                                Sex_mimic,
                                Sex_PGMC))
                                # Sex_SYMP,
                                # Sex_NA))
Sex_all <- Sex_all %>% filter(!is.na(sex))
table(Sex_all$patient_group,Sex_all$sex)
##        
##          F  M
##   ALS   27 38
##   CTR   21 26
##   mimic  4 17
##   PGMC  31 22
fisher.test(Sex_all$patient_group,Sex_all$sex)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  Sex_all$patient_group and Sex_all$sex
## p-value = 0.01791
## alternative hypothesis: two.sided
pairwise_fisher_test(table(Sex_all$patient_group,Sex_all$sex), p.adjust.method = "holm")

Bioinformatic analyses

Step 5: creation of summarised experiment datasets

The fifth step is to remove the QC rows, save the experimental setup (info of patient ID, batch, sample ID, visit, etc.), structure the lipidomics dataset to numeric and create summarised experiments datasets.

- CSF

# removal of QC rows
raw_data_CSF_use = raw_data_CSF_final %>%
  filter(!str_detect(Tube_number, "QC"))

tube_batch_CSF = raw_data_CSF_use[,1:2] %>%
  left_join(combined_data_CSF %>% select(Tube_number,Batch,Patient,Visit))
raw_data_CSF_use_mat = as.matrix(raw_data_CSF_use[,3:ncol(raw_data_CSF_use)])
rownames(raw_data_CSF_use_mat) = tube_batch_CSF %>% pull(Tube_number)

# matrix with raw lipidomics data (rows - lipids and column - tube ID)
lipidomics_CSF = as.data.frame(t(raw_data_CSF_use_mat))
dim(lipidomics_CSF)
## [1] 892 144
# make columns numeric
for(i in 1:ncol(lipidomics_CSF)){lipidomics_CSF[,i] = as.numeric(lipidomics_CSF[,i])}

#make a list for all dataframes
lipidomics_data_CSF = list(raw_data = lipidomics_CSF)

## summarized objects 
tube_batch_CSF = tube_batch_CSF %>% 
  left_join(IDs_CSF_data) %>%
  mutate(subtype = ifelse(type == "C9orf72","C9orf72",
                          ifelse(type == "SOD1","SOD1",
                                 ifelse(type == "FUS","FUS",
                                        ifelse(type == "other","other",
                                               ifelse(type == "TARDBP","TARDBP",NA)))))) %>%
  mutate(type = ifelse(type %in% c("C9orf72","SOD1","FUS","TARDBP","other"),
                       "PGMC",type)) %>%
  arrange(subtype) %>%
  distinct(Tube_number, Batch,Patient,PatientID,type, .keep_all = TRUE)

# Check which tube numbers do not have a patient ID or type assigned (majority uncertain PURE MOTOR SYMPTOM (PMS) / MND MIMIC / EARLY ALS (EALS))
tube_number_CSF_missing = tube_batch_CSF %>%
  filter(is.na(Patient) | is.na(type)) %>%
  arrange(Patient)
tube_number_CSF_missing
GeneralDocumentation %>%
  select(ParticipantCode,PatientID,PGMC,ALSuncertainty,ALSFUdiagnosis) %>%
  filter(ParticipantCode %in% tube_number_CSF_missing$Patient) %>%
  filter(!is.na(ParticipantCode))
abundance.columns <- 1:ncol(lipidomics_data_CSF$raw_data) # get abundance column numbers
      clin = data.frame(label = tube_batch_CSF$Tube_number, 
                        condition = tube_batch_CSF$type,
                        sub_condition = tube_batch_CSF$subtype,
                        batch = tube_batch_CSF$Batch)
      
      lipidomics_data_CSF$raw_data$name = lipidomics_data_CSF$raw_data$ID = rownames(lipidomics_data_CSF$raw_data)
      experimental.design = clin
      experimental.design <- experimental.design %>%
        group_by(condition) %>%            
        mutate(replicate = row_number()) %>%
        ungroup()
      
      lipidomics_data_CSF$se <- make_se(lipidomics_data_CSF$raw_data, abundance.columns, experimental.design)
lipidomics_data_CSF$se
## class: SummarizedExperiment 
## dim: 892 144 
## metadata(0):
## assays(1): ''
## rownames(892): CE(18:1-d7) Cholesterol(d7) ... PS(38:4)_PS(18:0/20:4)
##   PS(40:4)_PS(20:0/20:4)
## rowData names(2): ID name
## colnames(144): PGMC_1 PGMC_2 ... CTR_26 CTR_27
## colData names(6): label ID ... batch replicate
 writexl::write_xlsx(lipidomics_data_CSF$raw_data,"results/lipidomics_CSF_raw_data.xlsx")

- Plasma

# removal of QC rows
raw_data_plasma_use = raw_data_plasma_final %>%
  filter(!str_detect(Tube_number, "QC"))

tube_batch_plasma = raw_data_plasma_use[,1:2] %>%
  left_join(combined_data_plasma %>% select(Tube_number,Batch,Patient,Visit))
raw_data_plasma_use_mat = as.matrix(raw_data_plasma_use[,3:ncol(raw_data_plasma_use)])
rownames(raw_data_plasma_use_mat) = tube_batch_plasma %>% pull(Tube_number)

# matrix with raw lipidomics data (rows - lipids and column - tube ID)
lipidomics_plasma = as.data.frame(t(raw_data_plasma_use_mat))
dim(lipidomics_plasma)
## [1] 892 316
# make columns numeric
for(i in 1:ncol(lipidomics_plasma)){lipidomics_plasma[,i] = as.numeric(lipidomics_plasma[,i])}

#make a list for all dataframes
lipidomics_data_plasma = list(raw_data = lipidomics_plasma)

## summarized objects 
tube_batch_plasma = tube_batch_plasma %>% 
  left_join(IDs_plasma_data) %>%
  mutate(subtype = ifelse(type == "C9orf72","C9orf72",
                          ifelse(type == "SOD1","SOD1",
                                 ifelse(type == "FUS","FUS",
                                        ifelse(type == "other","other",
                                               ifelse(type == "TARDBP","TARDBP",NA)))))) %>%
  mutate(type = ifelse(type %in% c("C9orf72","SOD1","FUS","TARDBP","other"),
                       "PGMC",type)) %>%
  arrange(subtype) %>%
  distinct(Tube_number, Batch,Patient,PatientID,type, .keep_all = TRUE)

# Check which tube numbers do not have a patient ID or type assigned (majority uncertain PURE MOTOR SYMPTOM (PMS) / MND MIMIC / EARLY ALS (EALS))
tube_number_plasma_missing = tube_batch_plasma %>%
  filter(is.na(Patient) | is.na(type)) %>%
  arrange(Patient)
tube_number_plasma_missing
GeneralDocumentation %>%
  select(ParticipantCode,PatientID,PGMC,ALSuncertainty,ALSFUdiagnosis) %>%
  filter(ParticipantCode %in% tube_number_plasma_missing$Patient) %>%
  filter(!is.na(ParticipantCode))
abundance.columns <- 1:ncol(lipidomics_data_plasma$raw_data) # get abundance column numbers
      clin = data.frame(label = tube_batch_plasma$Tube_number, 
                        condition = tube_batch_plasma$type,
                        sub_condition = tube_batch_plasma$subtype,
                        batch = tube_batch_plasma$Batch)
      
      lipidomics_data_plasma$raw_data$name = lipidomics_data_plasma$raw_data$ID = rownames(lipidomics_data_plasma$raw_data)
      experimental.design = clin
      experimental.design <- experimental.design %>%
        group_by(condition) %>%            
        mutate(replicate = row_number()) %>%
        ungroup()
      
      lipidomics_data_plasma$se <- make_se(lipidomics_data_plasma$raw_data, abundance.columns, experimental.design)
 lipidomics_data_plasma$se
## class: SummarizedExperiment 
## dim: 892 316 
## metadata(0):
## assays(1): ''
## rownames(892): CE(18:1-d7) Cholesterol(d7) ... PS(38:4)_PS(18:0/20:4)
##   PS(40:4)_PS(20:0/20:4)
## rowData names(2): ID name
## colnames(316): PGMC_1 PGMC_2 ... mimic_32 ALS_95
## colData names(6): label ID ... batch replicate
 writexl::write_xlsx(lipidomics_data_plasma$raw_data,"results/lipidomics_plasma_raw_data.xlsx")

Step 6: missing analyses

The sixth step is to check for missing omics per patient and overall, filter lipids that are not quantified in less than 1/3 of the samples and normalise the data (using vsn).

- CSF

# heatmap missing 
vis_miss(lipidomics_data_CSF$raw_data,show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_CSF.png", width = 15, height = 10, units = "in")

# filter for lipids that are quantified in at least 2/3 of the samples
lipidomics_data_CSF$se_filtered = filter_proteins(lipidomics_data_CSF$se,"fraction",min = 0.66)

#dimensions of the data
dim(lipidomics_data_CSF$se)
## [1] 892 144
dim(lipidomics_data_CSF$se_filtered)
## [1] 186 144
# heatmap missing for filtered data
vis_miss(as.data.frame(assay(lipidomics_data_CSF$se_filtered)),show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_filtered_CSF.png", width = 12, height = 8, units = "in")

plot_frequency(lipidomics_data_CSF$se)

ggsave("plots/frequency_met_identification_raw_CSF.pdf", width = 11, height = 8, units = "in")
plot_frequency(lipidomics_data_CSF$se_filtered)

ggsave("plots/frequency_met_identification_filtered_CSF.pdf", width = 11, height = 8, units = "in")

# % missing per patient: 
round(apply(X = as.data.frame(assay(lipidomics_data_CSF$se)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_CSF$se))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##     77.5     76.3     77.9     77.5     77.4     77.7     77.0     77.2 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##     79.0     79.8     76.9     78.6     76.7     78.6     77.8     77.7 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##     78.3     78.6     77.1     76.6     77.5     77.6     78.9     79.0 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30    CTR_1  mimic_1 
##     78.6     78.0     77.5     77.2     78.4     77.8     76.5     73.0 
##    CTR_2  mimic_2    ALS_1    CTR_3   SYMP_1    ALS_2    ALS_3    CTR_4 
##     77.1     75.8     73.8     73.3     78.3     75.9     76.5     76.9 
##  mimic_3    ALS_4    CTR_5    CTR_6  mimic_4  mimic_5   SYMP_2    ALS_5 
##     76.1     76.2     75.9     77.2     77.0     74.8     78.0     76.3 
##  mimic_6    ALS_6    ALS_7    CTR_7  mimic_7    CTR_8  mimic_8    ALS_8 
##     76.3     78.5     76.1     76.3     76.1     76.8     76.1     75.8 
##    ALS_9    CTR_9   ALS_10  mimic_9   ALS_11   ALS_12   ALS_13 mimic_10 
##     76.6     76.9     75.7     72.5     74.6     73.9     73.5     76.9 
##   ALS_14   SYMP_3   ALS_15   CTR_10 mimic_11   CTR_11   ALS_16   CTR_12 
##     78.3     76.2     78.4     77.7     78.4     78.6     75.0     72.9 
##   CTR_13   CTR_14 mimic_12   CTR_15   CTR_16   SYMP_4 mimic_13   ALS_17 
##     78.4     77.1     77.6     77.6     78.0     77.8     75.4     77.9 
##   ALS_18 mimic_14   CTR_17   ALS_19   ALS_20 mimic_15   ALS_21   ALS_22 
##     79.1     75.6     76.9     77.0     78.6     76.6     79.9     75.8 
##   CTR_18 mimic_16   ALS_23   ALS_24   SYMP_5   CTR_19   ALS_25   ALS_26 
##     77.9     76.0     76.2     75.3     77.9     77.7     78.1     78.8 
##   ALS_27   ALS_28   SYMP_6   ALS_29   ALS_30   SYMP_7   ALS_31   ALS_32 
##     78.5     79.3     80.0     78.0     79.0     78.1     77.5     78.0 
##   ALS_33   CTR_20   SYMP_8 mimic_17   SYMP_9   CTR_21   CTR_22   ALS_34 
##     75.2     79.0     77.9     77.5     76.5     77.2     80.4     81.1 
##   ALS_35   ALS_36   ALS_37   CTR_23   ALS_38   ALS_39   ALS_40 mimic_18 
##     76.6     74.4     79.3     79.1     77.1     78.7     77.9     78.8 
##   ALS_41   N.A._1  SYMP_10   CTR_24   ALS_42   ALS_43   ALS_44   ALS_45 
##     77.5     79.4     78.1     79.0     77.8     76.0     76.3     77.7 
##   CTR_25 mimic_19 mimic_20   ALS_46   ALS_47   ALS_48   ALS_49   ALS_50 
##     78.8     79.8     77.9     59.3     78.3     78.4     78.0     76.6 
##   ALS_51   N.A._2 mimic_21   ALS_52 mimic_22   ALS_53   CTR_26   CTR_27 
##     74.9     81.6     80.2     77.4     76.9     77.0     78.3     78.5
round(apply(X = as.data.frame(assay(lipidomics_data_CSF$se_filtered)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_CSF$se_filtered))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##      1.6      2.2      5.4      1.1      2.2      2.2      0.5      2.7 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##      6.5      6.5      7.0      3.2      4.8      3.8      3.8      3.8 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##      3.2      4.8      2.2      0.5      2.2      2.7      7.0      3.8 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30    CTR_1  mimic_1 
##      3.2      3.8      3.8      4.8      3.2      2.7      2.2      0.0 
##    CTR_2  mimic_2    ALS_1    CTR_3   SYMP_1    ALS_2    ALS_3    CTR_4 
##      5.4      0.5      0.5      0.0      3.2      1.1      1.1      1.1 
##  mimic_3    ALS_4    CTR_5    CTR_6  mimic_4  mimic_5   SYMP_2    ALS_5 
##      1.6      2.7      1.1      2.7      1.6      2.2      3.8      1.1 
##  mimic_6    ALS_6    ALS_7    CTR_7  mimic_7    CTR_8  mimic_8    ALS_8 
##      2.2      4.8      1.6      1.1      1.1      2.2      1.1      0.5 
##    ALS_9    CTR_9   ALS_10  mimic_9   ALS_11   ALS_12   ALS_13 mimic_10 
##      1.1      1.6      1.1      2.2      0.5      0.5      0.5      1.6 
##   ALS_14   SYMP_3   ALS_15   CTR_10 mimic_11   CTR_11   ALS_16   CTR_12 
##      6.5      2.2      3.2      1.6      3.8      2.7      0.5      2.7 
##   CTR_13   CTR_14 mimic_12   CTR_15   CTR_16   SYMP_4 mimic_13   ALS_17 
##      2.7      1.6      3.8      2.2      2.7      3.8      1.1      2.2 
##   ALS_18 mimic_14   CTR_17   ALS_19   ALS_20 mimic_15   ALS_21   ALS_22 
##      6.5      1.1      1.1      1.1      4.3      2.2      8.1      1.1 
##   CTR_18 mimic_16   ALS_23   ALS_24   SYMP_5   CTR_19   ALS_25   ALS_26 
##      2.2      1.6      1.6      1.1      2.2      2.2      3.2      4.3 
##   ALS_27   ALS_28   SYMP_6   ALS_29   ALS_30   SYMP_7   ALS_31   ALS_32 
##      3.2      5.9      9.1      5.4      6.5      4.3      4.8      2.7 
##   ALS_33   CTR_20   SYMP_8 mimic_17   SYMP_9   CTR_21   CTR_22   ALS_34 
##      3.2      5.4      4.3      3.2      2.2      2.7     11.3     14.0 
##   ALS_35   ALS_36   ALS_37   CTR_23   ALS_38   ALS_39   ALS_40 mimic_18 
##      2.7      1.1      8.1      6.5      1.6      7.0      3.2      5.4 
##   ALS_41   N.A._1  SYMP_10   CTR_24   ALS_42   ALS_43   ALS_44   ALS_45 
##      1.6      6.5      3.2      7.5      4.3      2.7      0.5      2.2 
##   CTR_25 mimic_19 mimic_20   ALS_46   ALS_47   ALS_48   ALS_49   ALS_50 
##      7.0      9.7      3.8      0.5      3.8      3.8      3.8      1.6 
##   ALS_51   N.A._2 mimic_21   ALS_52 mimic_22   ALS_53   CTR_26   CTR_27 
##      0.5     17.7      8.6      3.2      1.6      3.2      4.8      5.4
#normalization
lipidomics_data_CSF$se_filt_norm <- normalize_vsn(lipidomics_data_CSF$se_filtered)
DEP::meanSdPlot(lipidomics_data_CSF$se_filt_norm)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the vsn package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filtered)),"results/data_filtered_CSF.xlsx")
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filt_norm)),"results/data_filtered_normalized_CSF.xlsx")

# imputation
lipidomics_data_CSF$se_filt_impute <- impute(
  lipidomics_data_CSF$se_filt_norm,fun = "MinProb",q = 0.01)
## Imputing along margin 2 (samples/columns).
## [1] 0.4905946
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filt_impute)),"results/data_filtered_normalized_imputed_CSF.xlsx")

- Plasma

# heatmap missing 
vis_miss(lipidomics_data_plasma$raw_data,show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_plasma.png", width = 15, height = 10, units = "in")

# filter for lipids that are quantified in at least 2/3 of the samples
lipidomics_data_plasma$se_filtered = filter_proteins(lipidomics_data_plasma$se,"fraction",min = 0.66)

#dimensions of the data
dim(lipidomics_data_plasma$se)
## [1] 892 316
dim(lipidomics_data_plasma$se_filtered)
## [1] 684 316
# heatmap missing for filtered data
vis_miss(as.data.frame(assay(lipidomics_data_plasma$se_filtered)),show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_filtered_plasma.png", width = 12, height = 8, units = "in")

plot_frequency(lipidomics_data_plasma$se)

ggsave("plots/frequency_met_identification_raw_plasma.pdf", width = 11, height = 8, units = "in")
plot_frequency(lipidomics_data_plasma$se_filtered)

ggsave("plots/frequency_met_identification_filtered_plasma.pdf", width = 11, height = 8, units = "in")

# % missing per patient: 
round(apply(X = as.data.frame(assay(lipidomics_data_plasma$se)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_plasma$se))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##     16.6     18.7     19.7     19.1     19.5     18.8     20.0     20.7 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##     22.0     22.6     23.1     23.7     22.3     21.1     21.3     22.2 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##     22.4     22.3     27.9     29.1     24.3     27.5     28.4     24.4 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30  PGMC_31  PGMC_32 
##     26.9     26.8     24.8     23.1     22.1     22.6     22.1     16.8 
##  PGMC_33  PGMC_34  PGMC_35  PGMC_36  PGMC_37  PGMC_38  PGMC_39  PGMC_40 
##     16.7     16.7     18.6     20.2     19.3     17.8     18.0     18.5 
##  PGMC_41  PGMC_42  PGMC_43  PGMC_44  PGMC_45  PGMC_46  PGMC_47  PGMC_48 
##     20.4     20.0     16.6     18.2     18.5     19.1     17.9     18.5 
##  PGMC_49  PGMC_50  PGMC_51  PGMC_52  PGMC_53  PGMC_54  PGMC_55  PGMC_56 
##     17.9     19.8     23.8     25.0     20.0     20.1     21.5     21.3 
##  PGMC_57  PGMC_58  PGMC_59  PGMC_60  PGMC_61  PGMC_62  PGMC_63  PGMC_64 
##     22.4     23.0     27.5     17.4     15.5     17.9     18.2     16.4 
##  PGMC_65  PGMC_66  PGMC_67  PGMC_68  PGMC_69  PGMC_70  PGMC_71  PGMC_72 
##     16.8     17.5     18.6     16.7     18.0     23.3     23.0     16.1 
##  PGMC_73  PGMC_74  PGMC_75  PGMC_76  PGMC_77  PGMC_78  PGMC_79  PGMC_80 
##     19.4     19.1     20.1     18.7     23.1     18.3     21.6     20.6 
##  PGMC_81  PGMC_82  PGMC_83  PGMC_84  PGMC_85    ALS_1    CTR_1    CTR_2 
##     19.1     19.7     22.9     28.1     25.4     18.2     17.9     17.6 
##    ALS_2    CTR_3    ALS_3  mimic_1    CTR_4    ALS_4    ALS_5    ALS_6 
##     16.0     17.8     17.9     20.9     15.7     16.8     15.8     17.2 
##   SYMP_1  mimic_2    ALS_7    CTR_5    CTR_6  mimic_3    ALS_8    ALS_9 
##     16.4     15.7     14.1     16.1     15.5     18.3     18.4     18.2 
##    CTR_7    CTR_8    CTR_9   SYMP_2   CTR_10   SYMP_3   CTR_11   ALS_10 
##     21.7     16.6     16.8     17.2     18.8     16.7     18.2     16.4 
##   ALS_11   ALS_12   CTR_12   SYMP_4  mimic_4   ALS_13   CTR_13   ALS_14 
##     16.5     14.8     17.4     14.8     17.4     15.4     16.0     15.4 
##   ALS_15   CTR_14  mimic_5  mimic_6  mimic_7   ALS_16   CTR_15   CTR_16 
##     16.8     17.9     18.3     20.0     19.2     21.9     18.0     17.2 
##  mimic_8   ALS_17   CTR_17   ALS_18   CTR_18  mimic_9   CTR_19   CTR_20 
##     17.0     18.4     17.6     17.2     19.8     19.3     17.5     18.5 
## mimic_10   CTR_21   CTR_22 mimic_11   ALS_19   CTR_23 mimic_12   ALS_20 
##     17.5     17.8     20.1     18.5     16.8     19.2     20.1     17.7 
##   ALS_21   ALS_22   ALS_23   ALS_24   CTR_24   CTR_25   ALS_25   CTR_26 
##     23.8     16.1     19.5     17.3     21.0     19.7     21.1     19.3 
##   CTR_27   ALS_26   ALS_27   CTR_28   CTR_29   CTR_30   CTR_31   CTR_32 
##     20.0     18.9     17.2     16.7     17.3     18.9     19.6     19.3 
##   CTR_33   ALS_28   ALS_29 mimic_13   ALS_30   CTR_34   ALS_31   CTR_35 
##     19.1     19.4     18.3     17.3     19.1     21.1     18.2     20.3 
##   ALS_32   ALS_33   CTR_36   CTR_37   ALS_34   ALS_35   ALS_36   SYMP_5 
##     18.3     18.7     17.4     20.5     16.9     19.4     17.9     19.7 
##   CTR_38   ALS_37   CTR_39   CTR_40   ALS_38 mimic_14   ALS_39 mimic_15 
##     19.1     16.1     19.1     18.7     20.9     17.9     20.0     20.4 
##   CTR_41   CTR_42   CTR_43   ALS_40   ALS_41   CTR_44   SYMP_6   ALS_42 
##     21.1     22.1     20.1     22.4     20.1     21.5     20.9     20.5 
##   ALS_43   ALS_44   SYMP_7   SYMP_8   CTR_45   ALS_45   CTR_46 mimic_16 
##     22.5     23.4     23.3     20.4     20.2     20.6     23.5     20.1 
## mimic_17   CTR_47   CTR_48   ALS_46   CTR_49   CTR_50 mimic_18   ALS_47 
##     20.0     21.6     21.0     21.6     21.3     20.6     21.3     20.2 
##   CTR_51   CTR_52 mimic_19   CTR_53   CTR_54   CTR_55   CTR_56   ALS_48 
##     21.3     21.5     20.9     21.1     21.6     22.6     21.3     21.9 
##   ALS_49   ALS_50   CTR_57   CTR_58   CTR_59   CTR_60   ALS_51   CTR_61 
##     25.6     24.0     22.8     21.6     23.1     23.2     20.6     24.3 
##   ALS_52   CTR_62   ALS_53   CTR_63   ALS_54   ALS_55   ALS_56   CTR_64 
##     20.9     21.4     23.4     22.1     24.3     22.3     25.1     23.2 
##   ALS_57 mimic_20   ALS_58   SYMP_9   ALS_59  SYMP_10   ALS_60   ALS_61 
##     22.1     21.7     22.9     22.8     22.3     24.3     22.1     23.1 
##   ALS_62   ALS_63   ALS_64   ALS_65 mimic_21 mimic_22 mimic_23   CTR_65 
##     22.4     21.4     23.7     22.0     22.8     24.7     25.2     23.4 
##   CTR_66 mimic_24 mimic_25   CTR_67   ALS_66   CTR_68   N.A._1   ALS_67 
##     22.2     23.7     22.9     24.2     22.2     25.4     21.4     23.0 
##  SYMP_11   CTR_69   ALS_68   CTR_70 mimic_26  SYMP_12   ALS_69 mimic_27 
##     23.3     21.7     23.3     23.3     21.6     22.2     26.3     22.4 
##   ALS_70  SYMP_13   CTR_71   CTR_72   ALS_71   ALS_72 mimic_28   ALS_73 
##     23.2     22.8     23.9     23.2     23.2     25.2     21.6     22.6 
##   ALS_74   CTR_73   CTR_74   CTR_75   ALS_75   CTR_76   ALS_76   CTR_77 
##     22.9     20.1     23.1     23.9     26.0     22.3     23.3     23.5 
##   ALS_77   ALS_78   N.A._2 mimic_29  SYMP_14   CTR_78   CTR_79   ALS_79 
##     23.3     23.4     22.2     23.4     28.9     27.6     27.7     31.1 
##   CTR_80   ALS_80  SYMP_15   ALS_81   ALS_82  PGMC_86   ALS_83 mimic_30 
##     29.3     25.0     25.1     27.0     27.0     25.7     28.9     24.4 
##   CTR_81   ALS_84   ALS_85   ALS_86   ALS_87   ALS_88  SYMP_16   ALS_89 
##     25.8     27.5     25.8     25.9     28.1     25.7     26.3     25.4 
##   CTR_82   ALS_90   ALS_91   CTR_83  SYMP_17   ALS_92   ALS_93  SYMP_18 
##     23.0     24.2     25.3     26.7     23.0     25.7     24.7     25.8 
## mimic_31   ALS_94 mimic_32   ALS_95 
##     26.9     27.1     27.6     23.2
round(apply(X = as.data.frame(assay(lipidomics_data_plasma$se_filtered)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_plasma$se_filtered))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##      1.6      2.5      4.8      3.9      2.3      1.8      0.9      1.2 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##      1.6      2.6      2.9      3.4      1.8      0.7      1.2      1.3 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##      1.5      1.8      6.6      7.9      3.7      6.3      7.6      2.9 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30  PGMC_31  PGMC_32 
##      5.8      5.7      3.9      2.8      1.3      1.6      1.6      1.3 
##  PGMC_33  PGMC_34  PGMC_35  PGMC_36  PGMC_37  PGMC_38  PGMC_39  PGMC_40 
##      1.3      0.7      1.3      2.6      2.3      1.5      2.0      1.3 
##  PGMC_41  PGMC_42  PGMC_43  PGMC_44  PGMC_45  PGMC_46  PGMC_47  PGMC_48 
##      3.4      2.8      0.6      1.2      0.9      2.0      1.3      0.9 
##  PGMC_49  PGMC_50  PGMC_51  PGMC_52  PGMC_53  PGMC_54  PGMC_55  PGMC_56 
##      1.3      1.6      2.9      5.8      1.2      1.0      2.2      1.5 
##  PGMC_57  PGMC_58  PGMC_59  PGMC_60  PGMC_61  PGMC_62  PGMC_63  PGMC_64 
##      1.8      3.5      6.1      1.8      0.4      2.8      2.2      0.6 
##  PGMC_65  PGMC_66  PGMC_67  PGMC_68  PGMC_69  PGMC_70  PGMC_71  PGMC_72 
##      2.0      1.2      1.3      0.4      0.9      1.9      2.3      0.4 
##  PGMC_73  PGMC_74  PGMC_75  PGMC_76  PGMC_77  PGMC_78  PGMC_79  PGMC_80 
##      2.9      1.0      2.8      0.9      5.4      1.3      1.2      1.0 
##  PGMC_81  PGMC_82  PGMC_83  PGMC_84  PGMC_85    ALS_1    CTR_1    CTR_2 
##      0.4      1.0      1.8      7.0      4.2      2.8      0.9      1.8 
##    ALS_2    CTR_3    ALS_3  mimic_1    CTR_4    ALS_4    ALS_5    ALS_6 
##      0.4      0.9      0.9      4.2      1.0      0.6      0.7      1.9 
##   SYMP_1  mimic_2    ALS_7    CTR_5    CTR_6  mimic_3    ALS_8    ALS_9 
##      1.0      0.6      0.9      1.3      0.7      2.3      1.9      1.9 
##    CTR_7    CTR_8    CTR_9   SYMP_2   CTR_10   SYMP_3   CTR_11   ALS_10 
##      5.8      0.3      0.9      0.9      1.3      1.0      1.5      0.9 
##   ALS_11   ALS_12   CTR_12   SYMP_4  mimic_4   ALS_13   CTR_13   ALS_14 
##      0.6      0.1      1.0      0.4      1.8      0.7      0.1      0.4 
##   ALS_15   CTR_14  mimic_5  mimic_6  mimic_7   ALS_16   CTR_15   CTR_16 
##      0.7      2.0      0.3      2.9      1.9      4.1      0.6      1.0 
##  mimic_8   ALS_17   CTR_17   ALS_18   CTR_18  mimic_9   CTR_19   CTR_20 
##      0.9      1.6      1.0      1.3      1.9      2.0      2.3      1.6 
## mimic_10   CTR_21   CTR_22 mimic_11   ALS_19   CTR_23 mimic_12   ALS_20 
##      0.9      1.5      2.2      0.9      0.3      1.5      1.3      0.6 
##   ALS_21   ALS_22   ALS_23   ALS_24   CTR_24   CTR_25   ALS_25   CTR_26 
##      6.3      0.6      1.3      0.3      2.8      1.9      3.9      1.5 
##   CTR_27   ALS_26   ALS_27   CTR_28   CTR_29   CTR_30   CTR_31   CTR_32 
##      1.9      1.0      0.6      0.4      0.4      2.2      2.0      1.8 
##   CTR_33   ALS_28   ALS_29 mimic_13   ALS_30   CTR_34   ALS_31   CTR_35 
##      2.2      1.5      1.9      0.4      2.5      2.6      1.0      3.5 
##   ALS_32   ALS_33   CTR_36   CTR_37   ALS_34   ALS_35   ALS_36   SYMP_5 
##      0.7      1.0      0.4      4.1      0.9      2.6      1.6      1.5 
##   CTR_38   ALS_37   CTR_39   CTR_40   ALS_38 mimic_14   ALS_39 mimic_15 
##      1.6      0.1      1.0      2.0      2.9      0.9      1.2      1.3 
##   CTR_41   CTR_42   CTR_43   ALS_40   ALS_41   CTR_44   SYMP_6   ALS_42 
##      1.2      2.0      0.7      1.5      1.2      2.2      0.9      1.2 
##   ALS_43   ALS_44   SYMP_7   SYMP_8   CTR_45   ALS_45   CTR_46 mimic_16 
##      2.8      3.8      2.9      0.6      1.0      2.0      3.9      0.6 
## mimic_17   CTR_47   CTR_48   ALS_46   CTR_49   CTR_50 mimic_18   ALS_47 
##      0.9      1.2      0.9      2.2      1.9      0.6      0.9      1.2 
##   CTR_51   CTR_52 mimic_19   CTR_53   CTR_54   CTR_55   CTR_56   ALS_48 
##      0.7      1.0      0.9      0.6      1.2      2.5      0.9      1.2 
##   ALS_49   ALS_50   CTR_57   CTR_58   CTR_59   CTR_60   ALS_51   CTR_61 
##      5.4      3.5      1.8      1.6      1.0      2.5      0.6      4.1 
##   ALS_52   CTR_62   ALS_53   CTR_63   ALS_54   ALS_55   ALS_56   CTR_64 
##      0.6      1.3      4.5      1.9      3.1      1.3      3.9      2.3 
##   ALS_57 mimic_20   ALS_58   SYMP_9   ALS_59  SYMP_10   ALS_60   ALS_61 
##      1.5      1.6      2.3      1.9      1.9      2.9      1.6      2.0 
##   ALS_62   ALS_63   ALS_64   ALS_65 mimic_21 mimic_22 mimic_23   CTR_65 
##      1.8      0.7      2.0      1.3      1.9      4.5      3.9      2.2 
##   CTR_66 mimic_24 mimic_25   CTR_67   ALS_66   CTR_68   N.A._1   ALS_67 
##      1.5      2.6      2.0      2.9      2.2      3.9      1.2      1.9 
##  SYMP_11   CTR_69   ALS_68   CTR_70 mimic_26  SYMP_12   ALS_69 mimic_27 
##      2.2      1.2      3.1      2.6      1.2      1.8      6.0      1.5 
##   ALS_70  SYMP_13   CTR_71   CTR_72   ALS_71   ALS_72 mimic_28   ALS_73 
##      1.9      2.2      3.1      2.2      2.6      4.5      1.2      1.8 
##   ALS_74   CTR_73   CTR_74   CTR_75   ALS_75   CTR_76   ALS_76   CTR_77 
##      1.6      0.9      2.8      3.5      6.1      2.2      2.0      2.8 
##   ALS_77   ALS_78   N.A._2 mimic_29  SYMP_14   CTR_78   CTR_79   ALS_79 
##      3.4      2.6      1.5      2.5      8.0      6.7      6.4     10.8 
##   CTR_80   ALS_80  SYMP_15   ALS_81   ALS_82  PGMC_86   ALS_83 mimic_30 
##      9.1      4.2      4.2      6.6      5.7      4.8      8.8      3.2 
##   CTR_81   ALS_84   ALS_85   ALS_86   ALS_87   ALS_88  SYMP_16   ALS_89 
##      4.5      7.0      4.7      4.7      7.9      4.7      4.7      4.5 
##   CTR_82   ALS_90   ALS_91   CTR_83  SYMP_17   ALS_92   ALS_93  SYMP_18 
##      2.2      3.4      4.4      5.4      2.5      4.7      3.7      4.2 
## mimic_31   ALS_94 mimic_32   ALS_95 
##      6.3      6.7      6.4      2.6
#normalization
lipidomics_data_plasma$se_filt_norm <- normalize_vsn(lipidomics_data_plasma$se_filtered)
DEP::meanSdPlot(lipidomics_data_plasma$se_filt_norm)

writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filtered)),"results/data_filtered_plasma.xlsx")
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filt_norm)),"results/data_filtered_normalized_plasma.xlsx")

# imputation
lipidomics_data_plasma$se_filt_impute <- impute(
  lipidomics_data_plasma$se_filt_norm,fun = "MinProb",q = 0.01)
## Imputing along margin 2 (samples/columns).
## [1] 0.6924497
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filt_impute)),"results/data_filtered_normalized_imputed_plasma.xlsx")

Step 7: Visualizations

The seventh step is to create visualizations, such as heatmaps and PCAs (for each fluid with ID/batch info, and for both fluids together)

- CSF

## Distribution of expression values per ID and tube number
mean_expression_plot(data = t(assay(lipidomics_data_CSF$se)), 
                      file_sample = "plots/boxplots_expression_each_sample_CSF.pdf",
                      file_mass = "plots/boxplots_expression_each_lipid_CSF.pdf",
                      title = "CSF lipidomics")

## Heatmaps
# CSF
title = "lipidomics CSF" 

data = assay(lipidomics_data_CSF$se)
data_z = t(scale(t(data),center = TRUE,scale = TRUE))
data_z[is.na(data_z)] <- 0

#row annotation
names_lipids = rownames(data)
Lipids_final <- read_excel("data input/RE_ metabolomics/Lipids_final.xlsx")
lipids = Lipids_final %>% 
  select(Name,Class) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
lipids_overview = lipids %>%
  group_by(Class) %>%
  summarise(n_lipids = n()) %>%
  arrange(desc(n_lipids))
writexl::write_xlsx(lipids_overview,"results/lipids_overview.xlsx")
annotation_row = data.frame(lipid_type = as.factor(lipids$Class))
rownames(annotation_row) = lipids$Name
participant_type = tube_batch_CSF
annotation_col = data.frame(type = as.factor(lipidomics_data_CSF$se$condition))
rownames(annotation_col) = lipidomics_data_CSF$se$ID
annotation_col$type <- factor(
  annotation_col$type,
  levels = c("ALS","PGMC","CTR","mimic","SYMP", "N.A."))
colors_type = c("mediumpurple1", "darksalmon", "yellow4","blue4","seagreen4","orange3")
names(colors_type) = c("ALS","CTR","mimic","N.A.","PGMC","SYMP")
mycolors = colorRampPalette(brewer.pal(9, "Set1"))(length(unique(annotation_row$lipid_type)))
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = mycolors,
  type = colors_type)

# order data
row_order <- order(annotation_row$lipid_type)  
col_order <- order(annotation_col$type)
data          <- data[row_order, col_order]
data_z          <- data_z[row_order, col_order]
annotation_row <- annotation_row[row_order, , drop = FALSE]
annotation_col <- annotation_col[col_order, , drop = FALSE]

#without grouping, all lipids, expression data
p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap all lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  #color = colorRampPalette(c("navy", "white", "firebrick3"))(100))
                  color = viridis::magma(100, direction = -1))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, all lipids, z-score
p = make_pheatmap(data = data_z, 
                  cluster_cols = F, 
                  main = paste0("Heatmap all lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  color = colorRampPalette(c("navy", "white", "firebrick3"))(100))

                  #color = viridis::magma(100, direction = -1))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_zscore_raw_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered lipids
data = assay(lipidomics_data_CSF$se_filt_norm)
data_z = t(scale(t(data),center = TRUE,scale = TRUE))
data_z[is.na(data_z)] <- 0

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Class) %>%
  filter(Name %in% names_lipids)
idx <- match(names_lipids, lipids$Name)
keep <- !is.na(idx)
lipids <- lipids[idx[keep], ]
data   <- data[keep, , drop = FALSE]
annotation_row = data.frame(lipid_type = as.factor(lipids$Class))
rownames(annotation_row) = lipids$Name

# order data
row_order <- order(annotation_row$lipid_type)  
data          <- data[row_order, col_order]
data_z          <- data_z[row_order, col_order]
annotation_row <- annotation_row[row_order, , drop = FALSE]

# expression data
p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap filtered lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  #color = colorRampPalette(c("navy", "white", "firebrick3"))(100))
                  color = viridis::magma(100, direction = -1))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_filtered_",title,".pdf"))
## quartz_off_screen 
##                 2
# z-score
p = make_pheatmap(data = data_z, 
                  cluster_cols = F, 
                  main = paste0("Heatmap filtered lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  color = colorRampPalette(c("navy", "white", "firebrick3"))(100))

                  #color = viridis::magma(100, direction = -1))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_zscore_filtered_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered and imputed
data = assay(lipidomics_data_CSF$se_filt_impute)
data_z = t(scale(t(data),center = TRUE,scale = TRUE))
data_z[is.na(data_z)] <- 0

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Class) %>%
  filter(Name %in% names_lipids)
idx <- match(names_lipids, lipids$Name)
keep <- !is.na(idx)
lipids <- lipids[idx[keep], ]
data   <- data[keep, , drop = FALSE]
annotation_row = data.frame(lipid_type = as.factor(lipids$Class))
rownames(annotation_row) = lipids$Name

# order data
row_order <- order(annotation_row$lipid_type)  
data          <- data[row_order, col_order]
data_z          <- data_z[row_order, col_order]
annotation_row <- annotation_row[row_order, , drop = FALSE]

# expression data
p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap imputed lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  #color = colorRampPalette(c("navy", "white", "firebrick3"))(100))
                  color = viridis::magma(100, direction = -1))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_imputed_",title,".pdf"))
## quartz_off_screen 
##                 2
# z-score
p = make_pheatmap(data = data_z, 
                  cluster_cols = F, 
                  main = paste0("Heatmap imputed lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  color = colorRampPalette(c("navy", "white", "firebrick3"))(100))

                  #color = viridis::magma(100, direction = -1))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_zscore_imputed_",title,".pdf"))
## quartz_off_screen 
##                 2
# without grouping, 100 most variable lipids
d = assay(lipidomics_data_CSF$se_filt_impute)
d2 = head(order(rowVars(d),decreasing = T),100)
data = d[d2,]
data_z = t(scale(t(data),center = TRUE,scale = TRUE))
data_z[is.na(data_z)] <- 0

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Class) %>%
  filter(Name %in% names_lipids)
idx <- match(names_lipids, lipids$Name)
keep <- !is.na(idx)
lipids <- lipids[idx[keep], ]
data   <- data[keep, , drop = FALSE]
annotation_row = data.frame(lipid_type = as.factor(lipids$Class))
rownames(annotation_row) = lipids$Name

# order data
row_order <- order(annotation_row$lipid_type)  
data          <- data[row_order, col_order] 
data_z          <- data_z[row_order, col_order]
annotation_row <- annotation_row[row_order, , drop = FALSE]

# expression data
p = make_pheatmap(data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap 100 most variable lipids\n",title, "\nnot clustered"),
                  labels_col = colnames(d),
                  #color = colorRampPalette(c("navy", "white", "firebrick3"))(100))
                  color = viridis::magma(100, direction = -1))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_imputed_mostvar_",title,".pdf"))
## quartz_off_screen 
##                 2
# z-score
p = make_pheatmap(data_z, 
                  cluster_cols = F, 
                  main = paste0("Heatmap 100 most variable lipids\n",title, "\nnot clustered"),
                  labels_col = colnames(d),
                  color = colorRampPalette(c("navy", "white", "firebrick3"))(100))

                  #color = viridis::magma(100, direction = -1))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_zscore_imputed_mostvar_",title,".pdf"))
## quartz_off_screen 
##                 2
## UMAP (with participant status type)
d_CSF = t(assay(lipidomics_data_CSF$se_filt_impute))
labels_group = tube_batch_CSF$type
group = c("mediumpurple1", "darksalmon", "yellow4","blue4","seagreen4","orange3")
UMAP_density_plot(data = d_CSF,
                          ggtitle = paste0("UMAP with type labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_type_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_type_labels_",title,".pdf"),
                          colour_set = group)

# UMAP (with batch info)
labels_group = tube_batch_CSF$Batch
group = c("#3DB7E4", "#FF8849", "#69BE28")
UMAP_density_plot(data = d_CSF,
                          ggtitle = paste0("UMAP with batch labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_batch_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_batch_labels_",title,".pdf"),
                          colour_set = group)

- Plasma

## Distribution of expression values per ID and tube number
mean_expression_plot(data = t(assay(lipidomics_data_plasma$se)), 
                      file_sample = "plots/boxplots_expression_each_sample_plasma.pdf",
                      file_mass = "plots/boxplots_expression_each_lipid_plasma.pdf",
                      title = "plasma lipidomics")

## Heatmaps
# plasma
title = "lipidomics plasma" 

data = assay(lipidomics_data_plasma$se)
data_z = t(scale(t(data),center = TRUE,scale = TRUE))
data_z[is.na(data_z)] <- 0

#row annotation
names_lipids = rownames(data)
Lipids_final <- read_excel("data input/RE_ metabolomics/Lipids_final.xlsx")
lipids = Lipids_final %>% 
  select(Name,Class) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Class))
rownames(annotation_row) = lipids$Name
participant_type = tube_batch_plasma
annotation_col = data.frame(type = as.factor(lipidomics_data_plasma$se$condition))
rownames(annotation_col) = lipidomics_data_plasma$se$ID
annotation_col$type <- factor(
  annotation_col$type,
  levels = c("ALS","PGMC","CTR","mimic","SYMP", "N.A."))
colors_type = c("mediumpurple1", "darksalmon", "yellow4","blue4","seagreen4","orange3")
names(colors_type) = c("ALS","CTR","mimic","N.A.","PGMC","SYMP")
mycolors = colorRampPalette(brewer.pal(9, "Set1"))(length(unique(annotation_row$lipid_type)))
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = mycolors,
  type = colors_type)

# order data
row_order <- order(annotation_row$lipid_type)  
col_order <- order(annotation_col$type)
data          <- data[row_order, col_order]
data_z          <- data_z[row_order, col_order]
annotation_row <- annotation_row[row_order, , drop = FALSE]
annotation_col <- annotation_col[col_order, , drop = FALSE]

#without grouping, all lipids, expression data
p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap all lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  #color = colorRampPalette(c("navy", "white", "firebrick3"))(100))
                  color = viridis::magma(100, direction = -1))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, all lipids, z-score
p = make_pheatmap(data = data_z, 
                  cluster_cols = F, 
                  main = paste0("Heatmap all lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  color = colorRampPalette(c("navy", "white", "firebrick3"))(100))

                  #color = viridis::magma(100, direction = -1))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_zscore_raw_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered lipids
data = assay(lipidomics_data_plasma$se_filt_norm)
data_z = t(scale(t(data),center = TRUE,scale = TRUE))
data_z[is.na(data_z)] <- 0

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Class) %>%
  filter(Name %in% names_lipids)
idx <- match(names_lipids, lipids$Name)
keep <- !is.na(idx)
lipids <- lipids[idx[keep], ]
data   <- data[keep, , drop = FALSE]
annotation_row = data.frame(lipid_type = as.factor(lipids$Class))
rownames(annotation_row) = lipids$Name

# order data
row_order <- order(annotation_row$lipid_type)  
data          <- data[row_order, col_order]
data_z          <- data_z[row_order, col_order]
annotation_row <- annotation_row[row_order, , drop = FALSE]

# expression data
p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap filtered lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  #color = colorRampPalette(c("navy", "white", "firebrick3"))(100))
                  color = viridis::magma(100, direction = -1))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_filtered_",title,".pdf"))
## quartz_off_screen 
##                 2
# z-score
p = make_pheatmap(data = data_z, 
                  cluster_cols = F, 
                  main = paste0("Heatmap filtered lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  color = colorRampPalette(c("navy", "white", "firebrick3"))(100))

                  #color = viridis::magma(100, direction = -1))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_zscore_filtered_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered and imputed
data = assay(lipidomics_data_plasma$se_filt_impute)
data_z = t(scale(t(data),center = TRUE,scale = TRUE))
data_z[is.na(data_z)] <- 0

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Class) %>%
  filter(Name %in% names_lipids)
idx <- match(names_lipids, lipids$Name)
keep <- !is.na(idx)
lipids <- lipids[idx[keep], ]
data   <- data[keep, , drop = FALSE]
annotation_row = data.frame(lipid_type = as.factor(lipids$Class))
rownames(annotation_row) = lipids$Name

# order data
row_order <- order(annotation_row$lipid_type)  
data          <- data[row_order, col_order]
data_z <- data_z[row_order, col_order]
annotation_row <- annotation_row[row_order, , drop = FALSE]

# expression data
p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap imputed lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  #color = colorRampPalette(c("navy", "white", "firebrick3"))(100))
                  color = viridis::magma(100, direction = -1))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_imputed_",title,".pdf"))
## quartz_off_screen 
##                 2
# z-score
p = make_pheatmap(data = data_z, 
                  cluster_cols = F, 
                  main = paste0("Heatmap imputed lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data),
                  color = colorRampPalette(c("navy", "white", "firebrick3"))(100))

                  #color = viridis::magma(100, direction = -1))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_zscore_imputed_",title,".pdf"))
## quartz_off_screen 
##                 2
# without grouping, 100 most variable lipids
d = assay(lipidomics_data_plasma$se_filt_impute)
d2 = head(order(rowVars(d),decreasing = T),100)
data = d[d2,]
data_z = t(scale(t(data),center = TRUE,scale = TRUE))
data_z[is.na(data_z)] <- 0

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Class) %>%
  filter(Name %in% names_lipids)
idx <- match(names_lipids, lipids$Name)
keep <- !is.na(idx)
lipids <- lipids[idx[keep], ]
data   <- data[keep, , drop = FALSE]
annotation_row = data.frame(lipid_type = as.factor(lipids$Class))
rownames(annotation_row) = lipids$Name

# order data
row_order <- order(annotation_row$lipid_type)  
data          <- data[row_order, col_order]
data_z <- data_z[row_order, col_order]
annotation_row <- annotation_row[row_order, , drop = FALSE]

# expression data
p = make_pheatmap(data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap 100 most variable lipids\n",title, "\nnot clustered"),
                  labels_col = colnames(d),
                  #color = colorRampPalette(c("navy", "white", "firebrick3"))(100))
                  color = viridis::magma(100, direction = -1))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_imputed_mostvar_",title,".pdf"))
## quartz_off_screen 
##                 2
# z-score
p = make_pheatmap(data_z, 
                  cluster_cols = F, 
                  main = paste0("Heatmap 100 most variable lipids\n",title, "\nnot clustered"),
                  labels_col = colnames(d),
                  color = colorRampPalette(c("navy", "white", "firebrick3"))(100))

                  #color = viridis::magma(100, direction = -1))
save_pheatmap_pdf(p, filename = paste0("plots/heatmap_zscore_imputed_mostvar_",title,".pdf"))
## quartz_off_screen 
##                 2
## UMAP (with participant status type)
d_plasma = t(assay(lipidomics_data_plasma$se_filt_impute))
labels_group = tube_batch_plasma$type
group = c("mediumpurple1", "darksalmon", "yellow4","blue4","seagreen4","orange3")
UMAP_density_plot(data = d_plasma,
                          ggtitle = paste0("UMAP with type labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_type_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_type_labels_",title,".pdf"),
                          colour_set = group)

# UMAP (with batch info)
labels_group = tube_batch_plasma$Batch
group = c("#3DB7E4", "#FF8849", "#69BE28","#757083","#CF7684","#A0C8C0","#CDAE3B")
UMAP_density_plot(data = d_plasma,
                          ggtitle = paste0("UMAP with batch labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_batch_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_batch_labels_",title,".pdf"),
                          colour_set = group)

  • CSF and Plasma together
 proteins_in_both_fluids = colnames(d_CSF)[colnames(d_CSF) %in% colnames(d_plasma)]
  
  d_1 = d_CSF[,proteins_in_both_fluids]
  d_2 = d_plasma[,proteins_in_both_fluids]
  
  d = do.call("rbind",list(d_1,d_2))
  
  labels_group = c(rep("CSF", nrow(d_1)), rep("Plasma", nrow(d_2)))
  title = "plasma_vs_CSF"

# UMAP of both biofluids
UMAP_density_plot(data = d,
                  ggtitle = paste0("UMAP with fluid labels\n", title),
                  legend_name = "Fluid labels",
                  labels = labels_group,
                  file_location = paste0("plots/UMAP_fluid_group_lipids_",title,".pdf"),
                  file_location_labels=paste0("plots/UMAP_fluid_group_labels_",title,".pdf"),
                  colour_set = group)

# UMAP of both biofluids - batch corrected

batch_CSF = tube_batch_CSF$Batch
batch_plasma = tube_batch_plasma$Batch
batch = c(batch_CSF,batch_plasma)
fluid = labels_group
design = model.matrix(~fluid)

d_batch_corrected <- limma::removeBatchEffect(t(d), batch = batch, design = design)
d_batch_corrected <- t(d_batch_corrected)

UMAP_density_plot(data = d_batch_corrected,
                  ggtitle = paste0("UMAP with fluid labels (batch corrected)\n", title),
                  legend_name = "Fluid labels",
                  labels = labels_group,
                  file_location = paste0("plots/UMAP_fluid_group_lipids_batch_corrected",title,".pdf"),
                  file_location_labels=paste0("plots/UMAP_fluid_group_labels_batch_corrected",title,".pdf"),
                  colour_set = group)

Step 8: differential expression analyses

The eighth step is to perform differential expression analyses, with particular interest of eALS vs CTR, PGMC vs CTR. For this, the information between IDs and the status (eALS, CTR, PGMC, mimic) have to be collected from other datasets

  • CSF
# ALS vs CTR
p = volcano_plot_contrast(res_CSF_ALS_CTR,
                      log2fc_col = "logFC_ALS_CTR",
                      padj_col   = "adj.P.Val_ALS_CTR",
                      group_pos  = "ALS",
                      group_neg  = "CTR",
                      title      = "ALS vs CTR (FDR = 10%)",
                      label_col = "name",
                      alpha      = 0.1,      
                      add_x      = 0.75, add_y = 0.05)
p

pdf("plots/DE_CSF_ALS_CTR_FDR10.pdf",width = 12,height = 8)
p
dev.off()
## quartz_off_screen 
##                 2
# PGMC vs CTR
p = volcano_plot_contrast(res_CSF_PGMC_CTR,
                      log2fc_col = "logFC_PGMC_CTR",
                      padj_col   = "adj.P.Val_PGMC_CTR",
                      group_pos  = "PGMC",
                      group_neg  = "CTR",
                      title      = "PGMC vs CTR (FDR = 10%)",
                      label_col = "name",
                      alpha      = 0.1,      
                      add_x      = 0.7, add_y = 0.05)
p

pdf("plots/DE_CSF_PGMC_CTR_FDR10.pdf",width = 12,height = 8)
p
dev.off()
## quartz_off_screen 
##                 2
  • Plasma
# ALS vs CTR
p = volcano_plot_contrast(res_plasma_ALS_CTR,
                      log2fc_col = "logFC_ALS_CTR",
                      padj_col   = "adj.P.Val_ALS_CTR",
                      group_pos  = "ALS",
                      group_neg  = "CTR",
                      title      = "ALS vs CTR (FDR = 5%)",
                      label_col = "name",
                      alpha      = 0.05,      
                      add_x      = 0.4, add_y = 0.1)
p

pdf("plots/DE_plasma_ALS_CTR_FDR5.pdf",width = 12,height = 8)
p
dev.off()
## quartz_off_screen 
##                 2
p = volcano_plot_contrast(res_plasma_ALS_CTR,
                      log2fc_col = "logFC_ALS_CTR",
                      padj_col   = "adj.P.Val_ALS_CTR",
                      group_pos  = "ALS",
                      group_neg  = "CTR",
                      title      = "ALS vs CTR (FDR = 10%)",
                      label_col = "name",
                      alpha      = 0.1,      
                      add_x      = 0.5, add_y = 0.1)
p

pdf("plots/DE_plasma_ALS_CTR_FDR10.pdf",width = 12,height = 8)
p
dev.off()
## quartz_off_screen 
##                 2
# PGMC vs CTR
p = volcano_plot_contrast(res_plasma_PGMC_CTR,
                      log2fc_col = "logFC_PGMC_CTR",
                      padj_col   = "adj.P.Val_PGMC_CTR",
                      group_pos  = "PGMC",
                      group_neg  = "CTR",
                      title      = "PGMC vs CTR (FDR = 5%)",
                      label_col = "name",
                      alpha      = 0.05,      
                      add_x      = 0.8, add_y = 0.1)
p

pdf("plots/DE_plasma_PGMC_CTR_FDR5.pdf",width = 12,height = 8)
p
dev.off()
## quartz_off_screen 
##                 2
p = volcano_plot_contrast(res_plasma_PGMC_CTR,
                      log2fc_col = "logFC_PGMC_CTR",
                      padj_col   = "adj.P.Val_PGMC_CTR",
                      group_pos  = "PGMC",
                      group_neg  = "CTR",
                      title      = "PGMC vs CTR (FDR = 10%)",
                      label_col = "name",
                      alpha      = 0.1,      
                      add_x      = 0.7, add_y = 0.1)
p

pdf("plots/DE_plasma_PGMC_CTR_FDR10.pdf",width = 12,height = 8)
p
dev.off()
## quartz_off_screen 
##                 2